# _*_ coding: utf-8 _*_
import codecs
from bs4 import BeautifulSoup
import time, json, math
import sys, os
import asyncio
import aiohttp
import aiofiles

# Shared crawl state, created at import time.
urls = []  # NOTE(review): appears unused in this file — confirm before removing.
# Output file for scraped goods records (one JSON object per line).
# NOTE(review): assumes the 'dt' directory already exists — confirm.
f    = codecs.open('dt/goods.txt', 'w', encoding='utf-8', errors='ignore')
# Caps concurrent HTTP requests in getHtml at 5.
semaphore = asyncio.Semaphore(5)
# Optional uvloop acceleration (drop-in event loop replacement):
#asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

async def getHtml(url):
	"""Fetch *url* with at most 5 concurrent requests (module semaphore).

	If the URL ends in '.jpg', save the image bytes to a local path that
	mirrors the remote path and return True; otherwise return the response
	body decoded as utf-8 text.
	"""
	async with semaphore:
		async with aiohttp.ClientSession() as session:
			async with session.get(url) as html:
				if url.endswith('.jpg'):
					img = await html.read()
					# Mirror the remote path under the working directory.
					imgname = url.replace('http://www.13qh.com/', '')
					imgpath = os.path.dirname(imgname)
					# dirname may be '' for a bare filename; makedirs('')
					# would raise, so guard on a non-empty path.
					if imgpath and not os.path.exists(imgpath):
						os.makedirs(imgpath)
					# async with closes the file even on write errors
					# (the original leaked the aiofiles handle).
					async with aiofiles.open(imgname, 'wb') as fp:
						await fp.write(img)
					return True
				else:
					return await html.text(encoding='utf-8')

async def getList(url, **cat):
	"""Fetch one category listing page and crawl every goods link on it.

	The cat kwargs ('lan_id', 'sub_id') are forwarded unchanged to parse().
	"""
	tmp = await getHtml(url)
	try:
		htm = BeautifulSoup(tmp, 'lxml')
		ul  = htm.select('.goods-item .goods-pic a')
	except Exception as e:
		# Best-effort scrape: a malformed page just skips this listing.
		print(e)
		ul = None
	# 'is not None' for identity comparison (PEP 8); an empty list simply
	# makes the loop a no-op.
	if ul is not None:
		for li in ul:
			link = li.get('href')
			await parse(link, **cat)

async def parse(url, **cat):
	"""Fetch a goods detail page, download its images, and append one
	JSON record for the product to the module-level goods file.

	cat must supply 'lan_id' and 'sub_id' (category / sub-category ids).
	"""
	tmp = await getHtml(url)
	try:
		htm = BeautifulSoup(tmp, 'lxml')
		goods_id = url.split('/')[-1]
		goods_name = htm.select('.goods-title h3')[0].text
		goods_price = htm.select('.goods-info .sale_price')[0].text
		sale_price  = htm.select('.goods-info ul li')[0].find('del').text
		# BUG FIX: filter() returns an iterator in Python 3; the original
		# stored the filter object itself, which made json.dumps below
		# raise TypeError so every record was silently dropped by the
		# except clause. Join the kept characters back into a string.
		sale_price  = ''.join(ch for ch in sale_price if ch in '.0123456789')
		thumb_cont  = htm.select('.thumb-cont ul li')
		print(goods_name)
		goods_thumb = []
		for thumb in thumb_cont:
			img = thumb.find('img').get('big')
			goods_thumb.append(img)
			print(img)
			await getHtml('http://www.13qh.com' + img)
		detail_div  = htm.select('.detail-content p img')
		goods_detail = []
		for p in detail_div:
			src = p.get('src')
			goods_detail.append(src)
			print(src)
			await getHtml('http://www.13qh.com' + src)
		goods = {
			'cat_id': cat['lan_id'],
			'sub_id': cat['sub_id'],
			'goods_id': goods_id,
			'goods_name': goods_name,
			'goods_price': goods_price,
			'sale_price' : sale_price,
			'goods_thumb': goods_thumb,
			'goods_detail': goods_detail
		}
		# '\n' rather than os.linesep: the file is text-mode and
		# os.linesep would write a raw '\r\n' on Windows.
		f.write(json.dumps(goods) + '\n')
	except Exception as e:
		# Best-effort scrape: log and skip products whose markup differs.
		print(e)

async def caiz():
	"""Crawl the site root: build the category tree, schedule a listing
	crawl for every leaf category page, and dump the tree to
	dt/category.txt.
	"""
	url = 'http://www.13qh.com/'
	tmp = await getHtml(url)
	htm = BeautifulSoup(tmp, 'lxml')
	cat = htm.select('.category-content>ul>li')

	category = []
	tasks = []
	for li in cat:
		lan = li.select('p a')[0]
		lan_text = lan.text
		lan_id   = lan.get('href').split('/')[-1]
		category.append({'cat_id': lan_id, 'cat_name': lan_text, 'parent_id': 0})

		ul  = li.select('.category-list ul li')
		for u in ul:
			ua = u.select('.a')
			for a in ua:
				# BUG FIX: select() returns a ResultSet (list); the original
				# called .text/.get on the list itself, which raises
				# AttributeError. Take the first anchor, as done for '.b a'.
				sua = a.select('a')[0]
				sua_text = sua.text
				sua_id   = sua.get('href').split('/')[-1]
				category.append({'cat_id': sua_id, 'cat_name': sua_text, 'parent_id': lan_id})

			ub = u.select('.b a')[0]
			sub_text = ub.text
			sub_id   = ub.get('href').split('/')[-1]
			category.append({'cat_id': sub_id, 'cat_name': sub_text, 'parent_id': lan_id})

			uc = u.select('.c a')
			for c in uc:
				suc_text = c.text
				suc_href = c.get('href')
				suc_id   = suc_href.split('/')[-1]
				category.append({'cat_id': suc_id, 'cat_name': suc_text, 'parent_id': sub_id})

				for i in range(1, 20):
					# Schedule listing crawls concurrently; kept in 'tasks'
					# and gathered below — the original never awaited them,
					# so asyncio.run() cancelled every pending crawl as soon
					# as caiz returned.
					tasks.append(asyncio.ensure_future(
						getList("%s/page/%s" % (suc_href, i), lan_id = lan_id, sub_id = sub_id)))
	with codecs.open('dt/category.txt', 'w', encoding='utf-8', errors='ignore') as ff:
		ff.write(json.dumps(category))
	if tasks:
		await asyncio.gather(*tasks)

def main():
	"""Script entry point: run the crawl, then close the goods file."""
	# asyncio.run() creates and manages its own event loop; the original's
	# unused get_event_loop() call was dead code (and deprecated).
	try:
		asyncio.run(caiz())
	finally:
		# Close the module-level output file even if the crawl raises,
		# so buffered records are flushed to disk.
		f.close()

if __name__ == '__main__':
	main()